if(!require(pacman))
install.packages("pacman")
devtools::install_github("tidyverse/dsbox")
pacman::p_load(tidyverse,
scales,
devtools,
here,
class,
plotly,
dplyr,
caret,
BiocManager,
smotefamily,
pROC,
mclust,
factoextra,
cluster,
dbscan,
arules,
arulesViz,
)Data Mining Final Project
# Reading the dataset
hepatitis <- read_csv("HepatitisCdata.csv")# Handling the NA values by replacing them by the median of each column
hepatitis <- hepatitis %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
# Verifying that all NA values have been handled
colSums(is.na(hepatitis)) ...1 Category Age Sex ALB ALP ALT AST
0 0 0 0 0 0 0 0
BIL CHE CHOL CREA GGT PROT
0 0 0 0 0 0
write.csv(hepatitis, "HepatitisC_Cleaned.csv", row.names = FALSE)# EDA for the dataset
# Defining a colorblind-friendly palette
color_palette <- c("#440154FF", "#3B528BFF", "#21908CFF", "#5DC863FF", "#FDE725FF")
# Creating an enhanced interactive scatterplot
p <- plot_ly(
data = hepatitis,
x = ~AST,
y = ~ALT,
type = 'scatter',
mode = 'markers',
color = ~as.factor(Category),
colors = color_palette,
marker = list(
size = 8,
opacity = 0.8
),
text = ~paste(
"Category:", case_when(
Category == 0 ~ "Blood Donor",
Category == 1 ~ "Suspect Blood Donor",
Category == 2 ~ "Hepatitis",
Category == 3 ~ "Fibrosis",
Category == 4 ~ "Cirrhosis",
TRUE ~ as.character(Category)
),
"<br>AST:", AST,
"<br>ALT:", ALT,
"<br>Age:", Age
)
) %>%
layout(
title = list(
text = "Interactive Scatterplot: AST vs ALT by Category",
font = list(size = 18)
),
xaxis = list(
title = "AST (Aspartate Transaminase)",
titlefont = list(size = 14),
zeroline = FALSE,
showgrid = FALSE
),
yaxis = list(
title = "ALT (Alanine Transaminase)",
titlefont = list(size = 14),
zeroline = FALSE,
showgrid = FALSE
),
legend = list(
title = list(text = "Category"),
font = list(size = 12),
orientation = "v",
x = 1.1,
y = 0.5,
xanchor = "left"
),
plot_bgcolor = "#FFFFFF",
paper_bgcolor = "#FFFFFF"
)
# Updating legend labels after plot creation
p <- p %>% layout(
legend = list(
title = list(text = "Category"),
font = list(size = 12),
orientation = "v",
x = 1.1,
y = 0.5,
xanchor = "left",
traceorder = "normal",
itemsizing = "constant",
labels = list(
"0" = "Blood Donor",
"1" = "Suspect Blood Donor",
"2" = "Hepatitis",
"3" = "Fibrosis",
"4" = "Cirrhosis"
)
)
)
# Displaying the plot
pClassification Analysis for the dataset Hepatitis-C - Logistic Regression
# Install and load required libraries
if (!requireNamespace("caret", quietly = TRUE)) {
install.packages("caret")
}
if (!requireNamespace("themis", quietly = TRUE)) {
install.packages("themis")
}
if (!requireNamespace("recipes", quietly = TRUE)) {
install.packages("recipes")
}
if (!requireNamespace("pROC", quietly = TRUE)) {
install.packages("pROC")
}
library(caret)
library(themis)
library(recipes)
library(pROC)
# Load the dataset
hepatitis <- read.csv("HepatitisC_Cleaned.csv")
# Convert 'Category' into a binary variable (e.g., "Blood Donor" vs. others)
hepatitis <- hepatitis %>%
mutate(BinaryCategory = ifelse(Category == "0=Blood Donor", 1, 0))
# Encode 'Sex' as a numeric variable (e.g., "m" -> 1, "f" -> 0)
hepatitis <- hepatitis %>%
mutate(Sex = ifelse(Sex == "m", 1, 0))
# Exclude unnecessary columns
hepatitis <- hepatitis %>%
select(-...1, -Category) # Remove index and original 'Category'
# Ensure the target variable is a factor
hepatitis$BinaryCategory <- as.factor(hepatitis$BinaryCategory)
# Split the dataset into training and testing sets
set.seed(123) # For reproducibility
train_index <- createDataPartition(hepatitis$BinaryCategory, p = 0.8, list = FALSE)
train_data <- hepatitis[train_index, ]
test_data <- hepatitis[-train_index, ]
# Create a recipe for SMOTE
smote_recipe <- recipe(BinaryCategory ~ ., data = train_data) %>%
step_smote(BinaryCategory, over_ratio = 1) # Balance classes to a 1:1 ratio
# Prepare the SMOTE dataset
smote_data <- prep(smote_recipe) %>%
bake(new_data = NULL)
# Check the class distribution after SMOTE
print("Class distribution after applying SMOTE:")[1] "Class distribution after applying SMOTE:"
print(table(smote_data$BinaryCategory))
0 1
427 427
# Train logistic regression on the SMOTE-balanced dataset
smote_logistic_model <- glm(BinaryCategory ~ ., data = smote_data, family = "binomial")
# Summarize the model
print("Logistic Regression Model Summary:")[1] "Logistic Regression Model Summary:"
print(summary(smote_logistic_model))
Call:
glm(formula = BinaryCategory ~ ., family = "binomial", data = smote_data)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 6.246494 3.005821 2.078 0.037697 *
Age 0.055334 0.019529 2.833 0.004605 **
Sex 1.277596 0.462124 2.765 0.005699 **
ALB 0.170787 0.051974 3.286 0.001016 **
ALP 0.091808 0.011753 7.812 5.65e-15 ***
ALT 0.033121 0.007777 4.259 2.06e-05 ***
AST -0.163989 0.021844 -7.507 6.04e-14 ***
BIL -0.084453 0.025373 -3.328 0.000873 ***
CHE -0.239305 0.104772 -2.284 0.022368 *
CHOL 0.646587 0.210365 3.074 0.002115 **
CREA -0.024328 0.004495 -5.412 6.22e-08 ***
GGT -0.038332 0.006404 -5.986 2.15e-09 ***
PROT -0.182912 0.040763 -4.487 7.22e-06 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1183.90 on 853 degrees of freedom
Residual deviance: 251.74 on 841 degrees of freedom
AIC: 277.74
Number of Fisher Scoring iterations: 9
# Predict on the test set
smote_predictions <- predict(smote_logistic_model, newdata = test_data, type = "response")
# Convert probabilities to class labels (threshold = 0.5)
smote_predicted_classes <- ifelse(smote_predictions > 0.5, 1, 0)
# Evaluate the model
smote_confusion_matrix <- confusionMatrix(as.factor(smote_predicted_classes), as.factor(test_data$BinaryCategory))
print("Confusion Matrix after applying SMOTE:")[1] "Confusion Matrix after applying SMOTE:"
print(smote_confusion_matrix)Confusion Matrix and Statistics
Reference
Prediction 0 1
0 14 7
1 2 99
Accuracy : 0.9262
95% CI : (0.8646, 0.9657)
No Information Rate : 0.8689
P-Value [Acc > NIR] : 0.03364
Kappa : 0.7142
Mcnemar's Test P-Value : 0.18242
Sensitivity : 0.8750
Specificity : 0.9340
Pos Pred Value : 0.6667
Neg Pred Value : 0.9802
Prevalence : 0.1311
Detection Rate : 0.1148
Detection Prevalence : 0.1721
Balanced Accuracy : 0.9045
'Positive' Class : 0
# Plot ROC curve for the SMOTE model
smote_roc_curve <- roc(as.numeric(test_data$BinaryCategory), smote_predictions)
plot(smote_roc_curve, col = "red", main = "ROC Curve for Logistic Regression (SMOTE)")Interpretation of the classification - Logistic Regression
Understanding important terms before proceeding…
What is SMOTE? Tried to keep it as easy as possible…
Imagine you’re teaching a class where there are 90 boys and only 10 girls, and you’re trying to predict something about them (e.g., who will pass an exam). Since there are so many more boys than girls, most of your predictions might focus on boys, and you’d struggle to give fair attention to the girls. This is called class imbalance.
SMOTE is like a clever trick to “even the playing field.” It helps balance the boys and girls (or majority and minority groups in your data) by creating new, synthetic data points for the smaller group (girls, in this case) instead of just copying the existing ones.
How Does SMOTE Work? (Super Simple Example) Original Situation:
You have 10 girls (minority group). Instead of just copying the same girls, SMOTE creates “new girls” based on the ones you already have. These aren’t exact copies but are slightly adjusted versions that look realistic.
How It Creates New Data:
Imagine you know two girls, Alice and Bella, who both scored well on the last test. SMOTE “imagines” a new girl, “Cathy,” whose score is somewhere between Alice’s and Bella’s scores. Cathy is not real but synthetic and helps give the smaller group more representation. Result After SMOTE:
Now, instead of 90 boys and 10 girls, you might have 90 boys and 50 girls (original 10 plus 40 synthetic girls). This balanced data gives your model a fair chance to learn about both groups.
Sensitivity: How well you catch actual threats (e.g., detecting weapons). High sensitivity means you don’t miss any weapons.
Specificity: How well you avoid stopping harmless people (e.g., avoiding false alarms for innocent passengers). High specificity means you don’t wrongly suspect harmless people.
What Does This Visualization Show? This is a ROC Curve (Receiver Operating Characteristic Curve). It’s a graph that helps us understand how well our model is doing at separating two groups:
Blood Donors (Positive Class: 1) Non-Donors (Negative Class: 0) The red curve represents the performance of the model. It shows how good the model is at balancing two things:
Sensitivity (True Positives): How many “Blood Donors” the model correctly identifies. Specificity (True Negatives): How many “Non-Donors” the model correctly identifies. The diagonal gray line is a “coin toss” or random guess. If the red curve were close to this line, the model would be as good as random guessing.
What Question Does This Visualization Answer? “How well can this model distinguish between Blood Donors and Non-Donors?” It shows how accurately the model predicts who is a donor and who is not, across all possible thresholds (cut-off points for deciding whether someone is classified as a donor).
How to Interpret This Curve in Simple Terms The red curve is very close to the top-left corner.
Top-left corner means perfect performance: it catches all donors without falsely labeling “Non-Donors” as donors. Our model does an excellent job of separating the two groups. The steep rise near the beginning tells us:
The model identifies most “Blood Donors” (Sensitivity is high) with very few mistakes early on. High specificity and balanced accuracy:
The flat section at the top shows the model continues to correctly classify “Non-Donors” even as it adjusts thresholds. Overall, this curve shows the model is doing much better than guessing (red curve far above the diagonal line).
Why Is This Important?
This visualization helps answer:
“How reliable is this model?” The red curve being far from the gray line means the model is reliable.
“Does the model handle Blood Donors (minority class) well?” Yes, because the curve rises steeply, showing it catches “Blood Donors” early and accurately.
“Should we trust the model’s predictions?”
Yes, because the curve indicates that the model balances errors well between identifying donors and non-donors.
This graph tells us:
The model can distinguish between donors and non-donors accurately. It catches most donors (87.5%) while making very few mistakes. The model is better than random guessing and does a good job even with imbalanced data, thanks to SMOTE.
library(ggplot2)
test_data$Predicted_Probabilities <- smote_predictions
ggplot(test_data, aes(x = Predicted_Probabilities, fill = BinaryCategory)) +
geom_density(alpha = 0.5) +
labs(title = "Probability Distribution for Logistic Regression",
x = "Predicted Probability",
y = "Density",
fill = "Actual Class") +
theme_minimal()coef_df <- as.data.frame(summary(smote_logistic_model)$coefficients)
coef_df <- coef_df[-1, ] # Exclude intercept
coef_df$Feature <- rownames(coef_df)
ggplot(coef_df, aes(x = reorder(Feature, abs(Estimate)), y = Estimate)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "Feature Importance in Logistic Regression",
x = "Feature",
y = "Coefficient Value") +
theme_minimal()ggplot(smote_data, aes(x = ALT, y = AST, color = BinaryCategory)) +
geom_point() +
stat_smooth(method = "glm", method.args = list(family = "binomial"), se = FALSE) +
labs(title = "Decision Boundary of Logistic Regression",
x = "ALT",
y = "AST") +
theme_minimal()GMM Clustering
# Step 1: Load the dataset
# Assuming the data has been preprocessed (scaled, PCA applied, and features weighted)
hepatitis_data <- read.csv("HepatitisC_Cleaned.csv")
# Select numeric columns for clustering
numeric_data <- hepatitis_data[sapply(hepatitis_data, is.numeric)]
scaled_data <- scale(numeric_data)
# Perform PCA for dimensionality reduction
pca <- prcomp(scaled_data, center = TRUE, scale. = TRUE)
pca_data <- pca$x[, 1:2] # Retain the first two principal components
# Step 2: Apply Gaussian Mixture Models (GMM)
set.seed(123) # Ensure reproducibility
gmm_model <- Mclust(pca_data)
# Optimal number of clusters
cat("Optimal Number of Clusters (GMM):", gmm_model$G, "\n")Optimal Number of Clusters (GMM): 2
# Step 3: Evaluate Clustering Quality
# Silhouette Score
silhouette_scores <- silhouette(gmm_model$classification, dist(pca_data))
avg_silhouette <- mean(silhouette_scores[, 3])
cat("Average Silhouette Score (GMM):", round(avg_silhouette, 2), "\n")Average Silhouette Score (GMM): 0.62
# Adjusted Rand Index (ARI)
if ("Category" %in% colnames(hepatitis_data)) {
actual_labels <- as.numeric(factor(hepatitis_data$Category)) # Replace 'Category' with your actual label column
ari <- adjustedRandIndex(actual_labels, gmm_model$classification)
cat("Adjusted Rand Index (ARI):", round(ari, 2), "\n")
} else {
cat("No actual labels provided for ARI calculation.\n")
}Adjusted Rand Index (ARI): 0.63
# Step 5: Add Cluster Assignments to Dataset
hepatitis_data$Cluster <- as.factor(gmm_model$classification)
# Step 6: Analyze Clusters
# Summary statistics for each cluster
cluster_summary <- hepatitis_data %>%
group_by(Cluster) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE))
# Display cluster summary
print(cluster_summary)# A tibble: 2 × 13
Cluster ...1 Age ALB ALP ALT AST BIL CHE CHOL CREA GGT
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 281. 46.6 42.4 66.5 26.1 28.0 8.79 8.46 5.46 78.7 29.2
2 2 545. 54.6 34.3 84.0 49.1 95.4 34.7 5.82 4.54 105. 132.
# ℹ 1 more variable: PROT <dbl>
# Enhanced GMM Clustering Visualization with Labels
ggplot(as.data.frame(pca_data), aes(x = PC1, y = PC2, color = as.factor(gmm_model$classification))) +
geom_point(size = 3, alpha = 0.7) + # Data points
stat_ellipse(aes(fill = as.factor(gmm_model$classification)), alpha = 0.2, geom = "polygon") + # Ellipses
scale_color_manual(values = c("#4CAF50", "#FF5722"), name = "Cluster",
labels = c("Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
scale_fill_manual(values = c("#4CAF50", "#FF5722"), name = "Cluster",
labels = c("Cluster 1: Healthy/Low Risk", "Cluster 2: At-Risk/Diseased")) +
labs(
title = "Gaussian Mixture Model Clustering",
subtitle = "Principal Components and Identified Groups",
x = "Principal Component 1 (Explains 33.3% Variance)",
y = "Principal Component 2 (Explains 20.5% Variance)"
) +
theme_minimal(base_size = 15) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
legend.position = "right",
legend.title = element_text(face = "bold"),
legend.text = element_text(size = 10),
axis.title.x = element_text(size = 9),
axis.title.y = element_text(size = 9)
)# Scale the data
scaled_data <- scale(pca_data)
# k-NN Distance Plot
kNNdist <- kNNdistplot(scaled_data, k = 5) # k = minPts (adjust based on data)
abline(h = 1, col = "red", lty = 2) # Adjust the threshold manually# Apply DBSCAN with eps = 1.0 and minPts = 5 (rule of thumb: dimensions + 1)
dbscan_model <- dbscan(scaled_data, eps = 1, minPts = 5)
# Print cluster assignments
cat("Cluster Assignments:\n")Cluster Assignments:
table(dbscan_model$cluster)
0 1 2
13 598 4
# Calculate the percentage of noise
noise_percentage <- sum(dbscan_model$cluster == 0) / nrow(scaled_data) * 100
cat("Noise Percentage:", round(noise_percentage, 2), "%\n")Noise Percentage: 2.11 %
# Compute silhouette scores
silhouette_scores <- silhouette(dbscan_model$cluster, dist(scaled_data))
avg_silhouette <- mean(silhouette_scores[, 3])
# Print average silhouette score
cat("Average Silhouette Score (DBSCAN):", round(avg_silhouette, 2), "\n")Average Silhouette Score (DBSCAN): 0.65
cat("Cluster Sizes:\n")Cluster Sizes:
table(dbscan_model$cluster)
0 1 2
13 598 4
# Visualize DBSCAN clusters
ggplot(as.data.frame(pca_data), aes(x = PC1, y = PC2, color = as.factor(dbscan_model$cluster))) +
geom_point(size = 3, alpha = 0.7) +
scale_color_manual(
values = c("#999999", "#E41A1C", "#377EB8", "#4DAF4A", "#FF7F00"),
name = "Cluster",
labels = c("Noise", "Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4")
) +
labs(
title = "DBSCAN Clustering Results with eps = 1.0",
x = "Principal Component 1",
y = "Principal Component 2",
caption = "Noise is represented by Cluster 0"
) +
theme_minimal(base_size = 15) +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
plot.subtitle = element_text(hjust = 0.5),
legend.position = "right",
legend.title = element_text(face = "bold"),
legend.text = element_text(size = 12)
)cat("DBSCAN Results Summary:\n")DBSCAN Results Summary:
cat("Number of Clusters:", length(unique(dbscan_model$cluster)) - 1, "\n") # Exclude noise clusterNumber of Clusters: 2
cat("Noise Percentage:", round(noise_percentage, 2), "%\n")Noise Percentage: 2.11 %
cat("Average Silhouette Score:", round(avg_silhouette, 2), "\n")Average Silhouette Score: 0.65
# Install and load required libraries
if (!requireNamespace("e1071",
quietly = TRUE)) {
install.packages("e1071")
}
if (!requireNamespace("caret",
quietly = TRUE)) {
install.packages("caret")
}
if (!requireNamespace("dplyr",
quietly = TRUE)) {
install.packages("dplyr")
}
library(e1071)
library(caret)
library(dplyr)
# Load the dataset
hepatitis <- read.csv("HepatitisC_Cleaned.csv")
# Preprocessing
# Encode 'Sex' as a factor
hepatitis$Sex <- as.factor(hepatitis$Sex)
# Convert 'Category' to a factor for multi-class classification
hepatitis$Category <- as.factor(hepatitis$Category)
# Split the data into training and testing sets
set.seed(123) # For reproducibility
trainIndex <- createDataPartition(hepatitis$Category, p = 0.8, list = FALSE)
trainData <- hepatitis[trainIndex, ]
testData <- hepatitis[-trainIndex, ]
# Identify numeric features
numeric_features <- names(which(sapply(trainData, is.numeric)))
# Scale numeric features
scaled_train <- trainData
scaled_test <- testData
for (col in numeric_features) {
# Get scaling parameters from the training set
col_mean <- mean(trainData[[col]], na.rm = TRUE)
col_sd <- sd(trainData[[col]], na.rm = TRUE)
# Scale training and test data
scaled_train[[col]] <- (trainData[[col]] - col_mean) / col_sd
scaled_test[[col]] <- (testData[[col]] - col_mean) / col_sd
}
# Train the SVM model with a radial basis function kernel
svm_model <- svm(Category ~ ., data = scaled_train, kernel = "radial", cost = 1, gamma = 0.1)
# Predict on the test set
svm_predictions <- predict(svm_model, newdata = scaled_test)
# Evaluate the model
conf_matrix <- confusionMatrix(svm_predictions, scaled_test$Category)
print("Confusion Matrix for SVM:")[1] "Confusion Matrix for SVM:"
print(conf_matrix)Confusion Matrix and Statistics
Reference
Prediction 0=Blood Donor 0s=suspect Blood Donor 1=Hepatitis
0=Blood Donor 106 0 0
0s=suspect Blood Donor 0 0 0
1=Hepatitis 0 0 3
2=Fibrosis 0 0 0
3=Cirrhosis 0 1 1
Reference
Prediction 2=Fibrosis 3=Cirrhosis
0=Blood Donor 0 0
0s=suspect Blood Donor 0 0
1=Hepatitis 0 0
2=Fibrosis 3 1
3=Cirrhosis 1 5
Overall Statistics
Accuracy : 0.9669
95% CI : (0.9175, 0.9909)
No Information Rate : 0.876
P-Value [Acc > NIR] : 0.0004865
Kappa : 0.8546
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: 0=Blood Donor Class: 0s=suspect Blood Donor
Sensitivity 1.000 0.000000
Specificity 1.000 1.000000
Pos Pred Value 1.000 NaN
Neg Pred Value 1.000 0.991736
Prevalence 0.876 0.008264
Detection Rate 0.876 0.000000
Detection Prevalence 0.876 0.000000
Balanced Accuracy 1.000 0.500000
Class: 1=Hepatitis Class: 2=Fibrosis Class: 3=Cirrhosis
Sensitivity 0.75000 0.75000 0.83333
Specificity 1.00000 0.99145 0.97391
Pos Pred Value 1.00000 0.75000 0.62500
Neg Pred Value 0.99153 0.99145 0.99115
Prevalence 0.03306 0.03306 0.04959
Detection Rate 0.02479 0.02479 0.04132
Detection Prevalence 0.02479 0.03306 0.06612
Balanced Accuracy 0.87500 0.87073 0.90362
# Install and load the GGally package if not already installed
if (!requireNamespace("GGally", quietly = TRUE)) {
install.packages("GGally")
}
library(GGally)
# Specify the features to include in the plot
selected_features <- c("Age", "ALB", "ALP", "ALT", "AST", "BIL")
# Filter the dataset for specific categories
filtered_data <- scaled_train %>%
filter(Category %in% c("1=Hepatitis", "2=Fibrosis", "3=Cirrhosis")) # Assuming "1" = Hepatitis, "2" = Fibrosis, "3" = Cirrhosis
# Parallel coordinates plot for selected features and categories
ggparcoord(data = filtered_data,
columns = which(names(filtered_data) %in% selected_features),
groupColumn = "Category",
scale = "std", alphaLines = 0.5) +
labs(title = "Parallel Coordinates Plot for Hepatitis, Fibrosis, and Cirrhosis",
x = "Features", y = "Scaled Values") +
theme_minimal(base_size = 14)Association rule using Apriori Algorithm
# 1. Data Preparation
hepatitis_categorized <- hepatitis %>%
mutate(
Age_Group = case_when(
Age < 35 ~ "Young",
Age < 50 ~ "Middle",
Age < 65 ~ "Senior",
TRUE ~ "Elderly"
),
ALT_Level = case_when(
ALT < 40 ~ "Normal",
ALT < 80 ~ "Mild_Elevation",
ALT < 200 ~ "Moderate_Elevation",
TRUE ~ "Severe_Elevation"
),
AST_Level = case_when(
AST < 40 ~ "Normal",
AST < 80 ~ "Mild_Elevation",
AST < 200 ~ "Moderate_Elevation",
TRUE ~ "Severe_Elevation"
),
Disease_Status = case_when(
Category == 0 ~ "Blood_Donor",
Category == 1 ~ "Suspect",
Category == 2 ~ "Hepatitis",
Category == 3 ~ "Fibrosis",
Category == 4 ~ "Cirrhosis"
)
)
# 2. Create transactions
hepatitis_trans <-
as(data.frame(select(hepatitis_categorized,
Age_Group, ALT_Level, AST_Level, Disease_Status)),
"transactions")
# 3. Apply Apriori algorithm
rules <- apriori(hepatitis_trans,
parameter = list(
support = 0.03,
confidence = 0.5,
minlen = 2,
maxlen = 4
))Apriori
Parameter specification:
confidence minval smax arem aval originalSupport maxtime support minlen
0.5 0.1 1 none FALSE TRUE 5 0.03 2
maxlen target ext
4 rules TRUE
Algorithmic control:
filter tree heap memopt load sort verbose
0.1 TRUE TRUE FALSE TRUE 2 TRUE
Absolute minimum support count: 18
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[12 item(s), 615 transaction(s)] done [0.00s].
sorting and recoding items ... [9 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 done [0.00s].
writing ... [25 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# 4. Process rules
rules_pruned <-
rules[!is.redundant(rules)]
rules_sorted <-
sort(rules_pruned, by = "lift", decreasing = TRUE)
# 5. Create rules dataframe
rules_df <- data.frame(
lhs = labels(lhs(rules_sorted)),
rhs = labels(rhs(rules_sorted)),
support = quality(rules_sorted)$support,
confidence = quality(rules_sorted)$confidence,
lift = quality(rules_sorted)$lift
)
# 6. Visualization
# Scatter Plot
scatter_plot <- plot_ly(rules_df,
x = ~support,
y = ~confidence,
color = ~lift,
type = "scatter",
mode = "markers",
marker = list(size = 10),
text = ~paste("Rule:", lhs, "=>", rhs)) %>%
layout(
title = "Apriori Algorithm",
xaxis = list(title = "Support"),
yaxis = list(title = "Confidence"),
colorbar = list(title = "Lift")
)
# 7. Print Quality Assessment
cat("\nAssociation Rule Analysis Summary:")
Association Rule Analysis Summary:
cat("\nNumber of Rules:", nrow(rules_df))
Number of Rules: 21
cat("\nAverage Confidence (Accuracy):", round(mean(rules_df$confidence), 4))
Average Confidence (Accuracy): 0.7977
cat("\nAverage Lift:", round(mean(rules_df$lift), 4))
Average Lift: 1.0168
cat("\nAverage Support:", round(mean(rules_df$support), 4))
Average Support: 0.2092
# Print top 10 rules
cat("\n\nTop 10 Rules by Lift:\n")
Top 10 Rules by Lift:
head(rules_df[order(-rules_df$lift),
c("lhs", "rhs", "support", "confidence", "lift")], 10) lhs rhs support
1 {ALT_Level=Mild_Elevation,AST_Level=Normal} {Age_Group=Middle} 0.05691057
2 {ALT_Level=Mild_Elevation} {Age_Group=Middle} 0.07154472
3 {Age_Group=Young,AST_Level=Normal} {ALT_Level=Normal} 0.08780488
4 {AST_Level=Mild_Elevation} {Age_Group=Middle} 0.05040650
5 {Age_Group=Elderly,AST_Level=Normal} {ALT_Level=Normal} 0.03577236
6 {Age_Group=Young,ALT_Level=Normal} {AST_Level=Normal} 0.08780488
7 {Age_Group=Middle,ALT_Level=Normal} {AST_Level=Normal} 0.35934959
8 {Age_Group=Senior,AST_Level=Normal} {ALT_Level=Normal} 0.26829268
9 {ALT_Level=Normal} {AST_Level=Normal} 0.75121951
10 {AST_Level=Normal} {ALT_Level=Normal} 0.75121951
confidence lift
1 0.6034483 1.258036
2 0.5789474 1.206958
3 0.9473684 1.120445
4 0.5254237 1.095375
5 0.9166667 1.084135
6 0.9152542 1.082464
7 0.9132231 1.080062
8 0.9016393 1.066362
9 0.8884615 1.050777
10 0.8884615 1.050777
# Display plots
scatter_plotCHARM Algorithm
# Load required libraries
library(tidyverse)
library(arules)
library(arulesViz)
library(plotly)
# 1. Data Preprocessing
hepatitis_categorized <- hepatitis %>%
mutate(
Age_Group = case_when(
Age < 35 ~ "Young",
Age < 50 ~ "Middle",
Age < 65 ~ "Senior",
TRUE ~ "Elderly"
),
ALT_Level = case_when(
ALT < 40 ~ "ALT_Normal",
ALT < 80 ~ "ALT_Mild_Elevation",
ALT < 200 ~ "ALT_Moderate_Elevation",
TRUE ~ "ALT_Severe_Elevation"
),
AST_Level = case_when(
AST < 40 ~ " AST_Normal",
AST < 80 ~ " AST_Mild_Elevation",
AST < 200 ~ " AST_Moderate_Elevation",
TRUE ~ " AST_Severe_Elevation"
),
Disease_Status = case_when(
Category == 0 ~ "Blood_Donor",
Category == 1 ~ "Suspect",
Category == 2 ~ "Hepatitis",
Category == 3 ~ "Fibrosis",
Category == 4 ~ "Cirrhosis"
)
)
# 2. CHARM Implementation
charm_association_rules <- function(data, min_support = 0.05, min_confidence = 0.5) {
# Convert to transaction format
trans_matrix <- as.matrix(data)
n_trans <- nrow(trans_matrix)
# Find unique items
items <- unique(as.character(trans_matrix))
cat("Initial items:", length(items), "\n")
# Create vertical database (tid-lists)
tid_lists <- list()
frequent_items <- character()
# Find frequent single items
for(item in items) {
tids <- which(apply(trans_matrix, 1, function(x) item %in% x))
support <- length(tids)/n_trans
if(support >= min_support) {
tid_lists[[item]] <- tids
frequent_items <- c(frequent_items, item)
}
}
cat("Frequent items:", length(frequent_items), "\n")
# Store closed itemsets
closed_itemsets <- list()
# Find closed itemsets
find_closed_itemsets <- function(prefix = character(), prefix_tids = NULL,
remaining_items = frequent_items) {
for(i in seq_along(remaining_items)) {
item <- remaining_items[i]
new_tids <- if(is.null(prefix_tids)) tid_lists[[item]]
else intersect(prefix_tids, tid_lists[[item]])
support <- length(new_tids)/n_trans
if(support >= min_support) {
new_itemset <- c(prefix, item)
# Check if closed
is_closed <- TRUE
for(existing in names(closed_itemsets)) {
if(all(new_itemset %in% strsplit(existing, ",")[[1]]) &&
closed_itemsets[[existing]]$support == support) {
is_closed <- FALSE
break
}
}
if(is_closed) {
closed_itemsets[[paste(new_itemset, collapse=",")]] <<- list(
items = new_itemset,
support = support,
tids = new_tids
)
}
# Recursive call with remaining items
if(i < length(remaining_items)) {
find_closed_itemsets(new_itemset, new_tids,
remaining_items[(i+1):length(remaining_items)])
}
}
}
}
# Generate closed itemsets
cat("Generating closed itemsets...\n")
find_closed_itemsets()
cat("Found", length(closed_itemsets), "closed itemsets\n")
# Generate rules from closed itemsets
rules <- data.frame(
lhs = character(),
rhs = character(),
support = numeric(),
confidence = numeric(),
lift = numeric(),
stringsAsFactors = FALSE
)
# Generate rules
cat("Generating association rules...\n")
for(itemset_key in names(closed_itemsets)) {
itemset <- closed_itemsets[[itemset_key]]
if(length(itemset$items) >= 2) {
for(i in 1:(length(itemset$items)-1)) {
combs <- combn(itemset$items, i, simplify = FALSE)
for(lhs in combs) {
rhs <- setdiff(itemset$items, lhs)
# Calculate confidence
lhs_support <- max(sapply(names(closed_itemsets), function(key) {
if(all(lhs %in% closed_itemsets[[key]]$items))
return(closed_itemsets[[key]]$support)
return(0)
}))
if(lhs_support > 0) {
confidence <- itemset$support / lhs_support
if(confidence >= min_confidence) {
# Calculate lift
rhs_support <- max(sapply(names(closed_itemsets), function(key) {
if(all(rhs %in% closed_itemsets[[key]]$items))
return(closed_itemsets[[key]]$support)
return(0)
}))
lift <- confidence / rhs_support
rules[nrow(rules) + 1,] <- list(
lhs = paste(lhs, collapse=", "),
rhs = paste(rhs, collapse=", "),
support = itemset$support,
confidence = confidence,
lift = lift
)
}
}
}
}
}
}
return(rules)
}
# 3. Apply CHARM to dataset
selected_data <- select(hepatitis_categorized,
Age_Group, ALT_Level, AST_Level, Disease_Status)
rules <- charm_association_rules(selected_data, min_support = 0.05, min_confidence = 0.5)Initial items: 13
Frequent items: 10
Generating closed itemsets...
Found 24 closed itemsets
Generating association rules...
# 4. Visualize results
if(nrow(rules) > 0) {
# Scatter plot
scatter_plot <- plot_ly(rules,
x = ~confidence,
y = ~lift,
color = ~support,
type = "scatter",
mode = "markers",
text = ~paste("Rule:", lhs, "=>", rhs,
"<br>Support:", round(support, 4),
"<br>Confidence:", round(confidence, 4),
"<br>Lift:", round(lift, 4))
) %>%
layout(
title = "CHARM Algotrithm",
xaxis = list(title = "Confidence"),
yaxis = list(title = "Lift"),
colorbar = list(title = "Support")
)
# Print statistics
cat("\nAssociation Rule Analysis Results:\n")
cat("Total rules found:", nrow(rules), "\n")
cat("Average confidence:", round(mean(rules$confidence), 4), "\n")
cat("Average lift:", round(mean(rules$lift), 4), "\n")
cat("Average support:", round(mean(rules$support), 4), "\n")
# Print top rules
cat("\nTop 10 Rules by Lift:\n")
print(head(rules[order(-rules$lift), ], 10))
# Save results
write.csv(rules, "charm_rules.csv", row.names = FALSE)
} else {
cat("No rules found. Try adjusting the minimum support or confidence thresholds.\n")
}
Association Rule Analysis Results:
Total rules found: 23
Average confidence: 0.8
Average lift: 1.0279
Average support: 0.2454
Top 10 Rules by Lift:
lhs rhs support
12 ALT_Mild_Elevation, AST_Normal Middle 0.05691057
10 ALT_Mild_Elevation Middle 0.07154472
4 Young, AST_Normal ALT_Normal 0.08780488
14 AST_Mild_Elevation Middle 0.05040650
3 Young, ALT_Normal AST_Normal 0.08780488
8 Middle, ALT_Normal AST_Normal 0.35934959
18 Senior, AST_Normal ALT_Normal 0.26829268
2 Young ALT_Normal, AST_Normal 0.08780488
20 ALT_Normal AST_Normal 0.75121951
21 AST_Normal ALT_Normal 0.75121951
confidence lift
12 0.6034483 1.258036
10 0.5789474 1.206958
4 0.9473684 1.120445
14 0.5254237 1.095375
3 0.9152542 1.082464
8 0.9132231 1.080062
18 0.9016393 1.066362
2 0.7941176 1.057105
20 0.8884615 1.050777
21 0.8884615 1.050777
# Display plot
scatter_plot